{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "7eb35810",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "scikit-learn: 1.0.2\n",
      "\n"
     ]
    }
   ],
   "source": [
    "%load_ext watermark\n",
    "%watermark -p scikit-learn"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "52e2aac8",
   "metadata": {
    "tags": []
   },
   "source": [
    "# Working with Heterogeneous Datasets -- Datasets with Numerical and Categorical Columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "e303e954",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "from sklearn.preprocessing import OneHotEncoder\n",
    "from sklearn.neighbors import KNeighborsClassifier\n",
    "from sklearn.decomposition import PCA\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.compose import ColumnTransformer\n",
    "from sklearn.model_selection import train_test_split"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "38616735",
   "metadata": {},
   "source": [
    "- Suppose you have a dataset that has both numerical and categorical features as follows: "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "60e49330",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>SepalLength[cm]</th>\n",
       "      <th>SepalWidth[cm]</th>\n",
       "      <th>PetalLength[cm]</th>\n",
       "      <th>PetalWidth[cm]</th>\n",
       "      <th>Color_IMadeThisUp</th>\n",
       "      <th>Species</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Id</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>5.1</td>\n",
       "      <td>3.5</td>\n",
       "      <td>1.4</td>\n",
       "      <td>0.2</td>\n",
       "      <td>red</td>\n",
       "      <td>Iris-setosa</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>4.9</td>\n",
       "      <td>3.0</td>\n",
       "      <td>1.4</td>\n",
       "      <td>0.2</td>\n",
       "      <td>red</td>\n",
       "      <td>Iris-setosa</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4.7</td>\n",
       "      <td>3.2</td>\n",
       "      <td>1.3</td>\n",
       "      <td>0.2</td>\n",
       "      <td>red</td>\n",
       "      <td>Iris-setosa</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4.6</td>\n",
       "      <td>3.1</td>\n",
       "      <td>1.5</td>\n",
       "      <td>0.2</td>\n",
       "      <td>red</td>\n",
       "      <td>Iris-setosa</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>5.0</td>\n",
       "      <td>3.6</td>\n",
       "      <td>1.4</td>\n",
       "      <td>0.2</td>\n",
       "      <td>red</td>\n",
       "      <td>Iris-setosa</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    SepalLength[cm]  SepalWidth[cm]  PetalLength[cm]  PetalWidth[cm]  \\\n",
       "Id                                                                     \n",
       "1               5.1             3.5              1.4             0.2   \n",
       "2               4.9             3.0              1.4             0.2   \n",
       "3               4.7             3.2              1.3             0.2   \n",
       "4               4.6             3.1              1.5             0.2   \n",
       "5               5.0             3.6              1.4             0.2   \n",
       "\n",
       "   Color_IMadeThisUp      Species  \n",
       "Id                                 \n",
       "1                red  Iris-setosa  \n",
       "2                red  Iris-setosa  \n",
       "3                red  Iris-setosa  \n",
       "4                red  Iris-setosa  \n",
       "5                red  Iris-setosa  "
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.read_csv('data/iris_mod.csv', index_col='Id')\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "71d61060",
   "metadata": {},
   "source": [
    "- As usual, we first transform the class labels into an integer format:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "33591e9e",
   "metadata": {},
   "outputs": [],
   "source": [
    "X = df.drop('Species', axis=1)\n",
    "y = df['Species']\n",
    "\n",
    "label_dict = {'Iris-setosa': 0,\n",
    "              'Iris-versicolor': 1,\n",
    "              'Iris-virginica': 2}\n",
    "\n",
    "y = y.map(label_dict)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e65d56c2",
   "metadata": {},
   "source": [
    "- Next, we are going to set up a `Pipeline` that performs certain preprocessing steps only on the numerical features:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "5d306fc4",
   "metadata": {},
   "outputs": [],
   "source": [
    "numeric_features = ['SepalLength[cm]', 'SepalWidth[cm]', 'PetalLength[cm]', 'PetalWidth[cm]']\n",
    "\n",
    "numeric_transformer = Pipeline(steps=[\n",
    "    ('scaler', StandardScaler()),\n",
    "    ('feature_extraction', PCA(n_components=2))])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8a9acf24",
   "metadata": {},
   "source": [
    "- Above, we weren't interested in performing these preprocessing steps on the categorical feature(s); instead, we apply **different** preprocessing steps to the categorical variable like so:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "8834cbee",
   "metadata": {},
   "outputs": [],
   "source": [
    "categorical_features = ['Color_IMadeThisUp']\n",
    "categorical_transformer = Pipeline(steps=[\n",
    "    ('onehot', OneHotEncoder(drop='first'))])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d7313c74",
   "metadata": {},
   "source": [
    "- Scikit-learn's `ColumnTransformer` now allows us to merge these two separate preprocessing pipelines, which operate on different feature sets in our dataset:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "cb0420d8",
   "metadata": {},
   "outputs": [],
   "source": [
    "preprocessor = ColumnTransformer(\n",
    "    transformers=[\n",
    "        ('num', numeric_transformer, numeric_features),\n",
    "        ('cat', categorical_transformer, categorical_features)])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e6d43e93",
   "metadata": {},
   "source": [
    "- As a result, we get a 4-dimensional feature array (design matrix) if we apply this preprocessor. What are these 4 columns?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "6ad9ba2f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(150, 4)"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "temp = preprocessor.fit_transform(X)\n",
    "temp.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "c14cbcfa",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[-2.26454173,  0.5057039 ,  1.        ,  0.        ],\n",
       "       [-2.0864255 , -0.65540473,  1.        ,  0.        ],\n",
       "       [-2.36795045, -0.31847731,  1.        ,  0.        ],\n",
       "       [-2.30419716, -0.57536771,  1.        ,  0.        ],\n",
       "       [-2.38877749,  0.6747674 ,  1.        ,  0.        ]])"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "temp[:5]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c48e9fda",
   "metadata": {},
   "source": [
    "- The preprocessor can now also be conveniently used in a Scikit-learn pipeline as shown below:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "139ddc0a",
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train, X_test, y_train, y_test = train_test_split(X, y, \n",
    "                                                    test_size=0.2,\n",
    "                                                    random_state=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "f24ee04a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Test accuracy: 100.0%\n"
     ]
    }
   ],
   "source": [
    "clf_pipe = Pipeline(steps=[('preprocessor', preprocessor),\n",
    "                           ('classifier', KNeighborsClassifier(p=3))])\n",
    "\n",
    "\n",
    "clf_pipe.fit(X_train, y_train)\n",
    "print(f'Test accuracy: {clf_pipe.score(X_test, y_test)*100}%')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "45fa00fc-3185-4ca6-a370-0af58c65b83c",
   "metadata": {},
   "source": [
    "## Displaying Pipelines"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "94a6be01-bb29-4d50-84d1-1e12ffc07ce0",
   "metadata": {},
   "source": [
    "More info here: https://scikit-learn.org/dev/auto_examples/miscellaneous/plot_pipeline_display.html#sphx-glr-auto-examples-miscellaneous-plot-pipeline-display-py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "19f4e3f5-00ef-4e1e-9fa4-6c4e84c2681b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Pipeline(steps=[('preprocessor',\n",
       "                 ColumnTransformer(transformers=[('num',\n",
       "                                                  Pipeline(steps=[('scaler',\n",
       "                                                                   StandardScaler()),\n",
       "                                                                  ('feature_extraction',\n",
       "                                                                   PCA(n_components=2))]),\n",
       "                                                  ['SepalLength[cm]',\n",
       "                                                   'SepalWidth[cm]',\n",
       "                                                   'PetalLength[cm]',\n",
       "                                                   'PetalWidth[cm]']),\n",
       "                                                 ('cat',\n",
       "                                                  Pipeline(steps=[('onehot',\n",
       "                                                                   OneHotEncoder(drop='first'))]),\n",
       "                                                  ['Color_IMadeThisUp'])])),\n",
       "                ('classifier', KNeighborsClassifier(p=3))])"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "clf_pipe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "d4f2055d-a249-4556-a303-83793572a86c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<style>#sk-4c2c350c-6053-47e4-ae26-d6ebcb1ace2f {color: black;background-color: white;}#sk-4c2c350c-6053-47e4-ae26-d6ebcb1ace2f pre{padding: 0;}#sk-4c2c350c-6053-47e4-ae26-d6ebcb1ace2f div.sk-toggleable {background-color: white;}#sk-4c2c350c-6053-47e4-ae26-d6ebcb1ace2f label.sk-toggleable__label {cursor: pointer;display: block;width: 100%;margin-bottom: 0;padding: 0.3em;box-sizing: border-box;text-align: center;}#sk-4c2c350c-6053-47e4-ae26-d6ebcb1ace2f label.sk-toggleable__label-arrow:before {content: \"▸\";float: left;margin-right: 0.25em;color: #696969;}#sk-4c2c350c-6053-47e4-ae26-d6ebcb1ace2f label.sk-toggleable__label-arrow:hover:before {color: black;}#sk-4c2c350c-6053-47e4-ae26-d6ebcb1ace2f div.sk-estimator:hover label.sk-toggleable__label-arrow:before {color: black;}#sk-4c2c350c-6053-47e4-ae26-d6ebcb1ace2f div.sk-toggleable__content {max-height: 0;max-width: 0;overflow: hidden;text-align: left;background-color: #f0f8ff;}#sk-4c2c350c-6053-47e4-ae26-d6ebcb1ace2f div.sk-toggleable__content pre {margin: 0.2em;color: black;border-radius: 0.25em;background-color: #f0f8ff;}#sk-4c2c350c-6053-47e4-ae26-d6ebcb1ace2f input.sk-toggleable__control:checked~div.sk-toggleable__content {max-height: 200px;max-width: 100%;overflow: auto;}#sk-4c2c350c-6053-47e4-ae26-d6ebcb1ace2f input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {content: \"▾\";}#sk-4c2c350c-6053-47e4-ae26-d6ebcb1ace2f div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-4c2c350c-6053-47e4-ae26-d6ebcb1ace2f div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-4c2c350c-6053-47e4-ae26-d6ebcb1ace2f input.sk-hidden--visually {border: 0;clip: rect(1px 1px 1px 1px);clip: rect(1px, 1px, 1px, 1px);height: 1px;margin: -1px;overflow: hidden;padding: 0;position: absolute;width: 1px;}#sk-4c2c350c-6053-47e4-ae26-d6ebcb1ace2f div.sk-estimator {font-family: 
monospace;background-color: #f0f8ff;border: 1px dotted black;border-radius: 0.25em;box-sizing: border-box;margin-bottom: 0.5em;}#sk-4c2c350c-6053-47e4-ae26-d6ebcb1ace2f div.sk-estimator:hover {background-color: #d4ebff;}#sk-4c2c350c-6053-47e4-ae26-d6ebcb1ace2f div.sk-parallel-item::after {content: \"\";width: 100%;border-bottom: 1px solid gray;flex-grow: 1;}#sk-4c2c350c-6053-47e4-ae26-d6ebcb1ace2f div.sk-label:hover label.sk-toggleable__label {background-color: #d4ebff;}#sk-4c2c350c-6053-47e4-ae26-d6ebcb1ace2f div.sk-serial::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 2em;bottom: 0;left: 50%;}#sk-4c2c350c-6053-47e4-ae26-d6ebcb1ace2f div.sk-serial {display: flex;flex-direction: column;align-items: center;background-color: white;padding-right: 0.2em;padding-left: 0.2em;}#sk-4c2c350c-6053-47e4-ae26-d6ebcb1ace2f div.sk-item {z-index: 1;}#sk-4c2c350c-6053-47e4-ae26-d6ebcb1ace2f div.sk-parallel {display: flex;align-items: stretch;justify-content: center;background-color: white;}#sk-4c2c350c-6053-47e4-ae26-d6ebcb1ace2f div.sk-parallel::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 2em;bottom: 0;left: 50%;}#sk-4c2c350c-6053-47e4-ae26-d6ebcb1ace2f div.sk-parallel-item {display: flex;flex-direction: column;position: relative;background-color: white;}#sk-4c2c350c-6053-47e4-ae26-d6ebcb1ace2f div.sk-parallel-item:first-child::after {align-self: flex-end;width: 50%;}#sk-4c2c350c-6053-47e4-ae26-d6ebcb1ace2f div.sk-parallel-item:last-child::after {align-self: flex-start;width: 50%;}#sk-4c2c350c-6053-47e4-ae26-d6ebcb1ace2f div.sk-parallel-item:only-child::after {width: 0;}#sk-4c2c350c-6053-47e4-ae26-d6ebcb1ace2f div.sk-dashed-wrapped {border: 1px dashed gray;margin: 0 0.4em 0.5em 0.4em;box-sizing: border-box;padding-bottom: 0.4em;background-color: white;position: relative;}#sk-4c2c350c-6053-47e4-ae26-d6ebcb1ace2f div.sk-label label {font-family: monospace;font-weight: 
bold;background-color: white;display: inline-block;line-height: 1.2em;}#sk-4c2c350c-6053-47e4-ae26-d6ebcb1ace2f div.sk-label-container {position: relative;z-index: 2;text-align: center;}#sk-4c2c350c-6053-47e4-ae26-d6ebcb1ace2f div.sk-container {/* jupyter's `normalize.less` sets `[hidden] { display: none; }` but bootstrap.min.css set `[hidden] { display: none !important; }` so we also need the `!important` here to be able to override the default hidden behavior on the sphinx rendered scikit-learn.org. See: https://github.com/scikit-learn/scikit-learn/issues/21755 */display: inline-block !important;position: relative;}#sk-4c2c350c-6053-47e4-ae26-d6ebcb1ace2f div.sk-text-repr-fallback {display: none;}</style><div id=\"sk-4c2c350c-6053-47e4-ae26-d6ebcb1ace2f\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>Pipeline(steps=[(&#x27;preprocessor&#x27;,\n",
       "                 ColumnTransformer(transformers=[(&#x27;num&#x27;,\n",
       "                                                  Pipeline(steps=[(&#x27;scaler&#x27;,\n",
       "                                                                   StandardScaler()),\n",
       "                                                                  (&#x27;feature_extraction&#x27;,\n",
       "                                                                   PCA(n_components=2))]),\n",
       "                                                  [&#x27;SepalLength[cm]&#x27;,\n",
       "                                                   &#x27;SepalWidth[cm]&#x27;,\n",
       "                                                   &#x27;PetalLength[cm]&#x27;,\n",
       "                                                   &#x27;PetalWidth[cm]&#x27;]),\n",
       "                                                 (&#x27;cat&#x27;,\n",
       "                                                  Pipeline(steps=[(&#x27;onehot&#x27;,\n",
       "                                                                   OneHotEncoder(drop=&#x27;first&#x27;))]),\n",
       "                                                  [&#x27;Color_IMadeThisUp&#x27;])])),\n",
       "                (&#x27;classifier&#x27;, KNeighborsClassifier(p=3))])</pre><b>Please rerun this cell to show the HTML repr or trust the notebook.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item sk-dashed-wrapped\"><div class=\"sk-label-container\"><div class=\"sk-label sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"7463497d-c0a5-43e7-9ede-5310f09f4047\" type=\"checkbox\" ><label for=\"7463497d-c0a5-43e7-9ede-5310f09f4047\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">Pipeline</label><div class=\"sk-toggleable__content\"><pre>Pipeline(steps=[(&#x27;preprocessor&#x27;,\n",
       "                 ColumnTransformer(transformers=[(&#x27;num&#x27;,\n",
       "                                                  Pipeline(steps=[(&#x27;scaler&#x27;,\n",
       "                                                                   StandardScaler()),\n",
       "                                                                  (&#x27;feature_extraction&#x27;,\n",
       "                                                                   PCA(n_components=2))]),\n",
       "                                                  [&#x27;SepalLength[cm]&#x27;,\n",
       "                                                   &#x27;SepalWidth[cm]&#x27;,\n",
       "                                                   &#x27;PetalLength[cm]&#x27;,\n",
       "                                                   &#x27;PetalWidth[cm]&#x27;]),\n",
       "                                                 (&#x27;cat&#x27;,\n",
       "                                                  Pipeline(steps=[(&#x27;onehot&#x27;,\n",
       "                                                                   OneHotEncoder(drop=&#x27;first&#x27;))]),\n",
       "                                                  [&#x27;Color_IMadeThisUp&#x27;])])),\n",
       "                (&#x27;classifier&#x27;, KNeighborsClassifier(p=3))])</pre></div></div></div><div class=\"sk-serial\"><div class=\"sk-item sk-dashed-wrapped\"><div class=\"sk-label-container\"><div class=\"sk-label sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"f59817d4-bb5a-46db-80eb-5ec72c66126e\" type=\"checkbox\" ><label for=\"f59817d4-bb5a-46db-80eb-5ec72c66126e\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">preprocessor: ColumnTransformer</label><div class=\"sk-toggleable__content\"><pre>ColumnTransformer(transformers=[(&#x27;num&#x27;,\n",
       "                                 Pipeline(steps=[(&#x27;scaler&#x27;, StandardScaler()),\n",
       "                                                 (&#x27;feature_extraction&#x27;,\n",
       "                                                  PCA(n_components=2))]),\n",
       "                                 [&#x27;SepalLength[cm]&#x27;, &#x27;SepalWidth[cm]&#x27;,\n",
       "                                  &#x27;PetalLength[cm]&#x27;, &#x27;PetalWidth[cm]&#x27;]),\n",
       "                                (&#x27;cat&#x27;,\n",
       "                                 Pipeline(steps=[(&#x27;onehot&#x27;,\n",
       "                                                  OneHotEncoder(drop=&#x27;first&#x27;))]),\n",
       "                                 [&#x27;Color_IMadeThisUp&#x27;])])</pre></div></div></div><div class=\"sk-parallel\"><div class=\"sk-parallel-item\"><div class=\"sk-item\"><div class=\"sk-label-container\"><div class=\"sk-label sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"1e867236-b0c0-4e7e-b1d8-6ba0062766d7\" type=\"checkbox\" ><label for=\"1e867236-b0c0-4e7e-b1d8-6ba0062766d7\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">num</label><div class=\"sk-toggleable__content\"><pre>[&#x27;SepalLength[cm]&#x27;, &#x27;SepalWidth[cm]&#x27;, &#x27;PetalLength[cm]&#x27;, &#x27;PetalWidth[cm]&#x27;]</pre></div></div></div><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"2bb8c934-6a90-44be-8a3c-a8d227144372\" type=\"checkbox\" ><label for=\"2bb8c934-6a90-44be-8a3c-a8d227144372\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">StandardScaler</label><div class=\"sk-toggleable__content\"><pre>StandardScaler()</pre></div></div></div><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"ca605b78-5030-43bf-a980-76766c1d60f3\" type=\"checkbox\" ><label for=\"ca605b78-5030-43bf-a980-76766c1d60f3\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">PCA</label><div class=\"sk-toggleable__content\"><pre>PCA(n_components=2)</pre></div></div></div></div></div></div></div></div><div class=\"sk-parallel-item\"><div class=\"sk-item\"><div class=\"sk-label-container\"><div class=\"sk-label sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"a4a8bb8a-e41c-494d-b37d-ff71219fd1a7\" type=\"checkbox\" ><label for=\"a4a8bb8a-e41c-494d-b37d-ff71219fd1a7\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">cat</label><div 
class=\"sk-toggleable__content\"><pre>[&#x27;Color_IMadeThisUp&#x27;]</pre></div></div></div><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"9b5c6810-3fb2-4426-87bd-3b82d20787a8\" type=\"checkbox\" ><label for=\"9b5c6810-3fb2-4426-87bd-3b82d20787a8\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">OneHotEncoder</label><div class=\"sk-toggleable__content\"><pre>OneHotEncoder(drop=&#x27;first&#x27;)</pre></div></div></div></div></div></div></div></div></div></div><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"1a359085-7ea9-4973-9a38-0efd1dde8a41\" type=\"checkbox\" ><label for=\"1a359085-7ea9-4973-9a38-0efd1dde8a41\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">KNeighborsClassifier</label><div class=\"sk-toggleable__content\"><pre>KNeighborsClassifier(p=3)</pre></div></div></div></div></div></div></div>"
      ],
      "text/plain": [
       "Pipeline(steps=[('preprocessor',\n",
       "                 ColumnTransformer(transformers=[('num',\n",
       "                                                  Pipeline(steps=[('scaler',\n",
       "                                                                   StandardScaler()),\n",
       "                                                                  ('feature_extraction',\n",
       "                                                                   PCA(n_components=2))]),\n",
       "                                                  ['SepalLength[cm]',\n",
       "                                                   'SepalWidth[cm]',\n",
       "                                                   'PetalLength[cm]',\n",
       "                                                   'PetalWidth[cm]']),\n",
       "                                                 ('cat',\n",
       "                                                  Pipeline(steps=[('onehot',\n",
       "                                                                   OneHotEncoder(drop='first'))]),\n",
       "                                                  ['Color_IMadeThisUp'])])),\n",
       "                ('classifier', KNeighborsClassifier(p=3))])"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn import set_config\n",
    "\n",
    "\n",
    "set_config(display=\"diagram\")\n",
    "clf_pipe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c80e71bc-4d7d-4ea2-8545-31aa846ae1ee",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
